#include "opencl_source_map.hpp" 
namespace MNN { 
#ifndef MNN_OPENCL_BUFFER_CLOSED
#ifdef MNN_SUPPORT_INTEL_SUBGROUP
const char* buffer_convert_subgroup_buf = 
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define GLOBAL_SIZE_2_DIMS __private const int global_size_dim0,__private const int global_size_dim1,\n"
"#define DEAL_NON_UNIFORM_DIM2(input1, input2) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { "" return; "" }\n"
"// convert data from buffer(nhwc) to buffer(nc16hw16) float input\n"
"__kernel void nhwc_buffer_to_nc16hw16_buffer(GLOBAL_SIZE_2_DIMS\n"
" __global const INPUT_TYPE *input_ptr,\n"
" __private const int height,\n"
" __private const int width,__private const int channels,\n"
" __global OUTPUT_TYPE *output,\n"
" __private const int input_pad_left,__private const int input_pad_right,\n"
" __private const int output_pad_left,__private const int output_pad_right) {\n"
" int image_width_idx=get_global_id(0);\n"
" int image_height_idx=get_global_id(1);\n"
" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n"
" const int batch_idx=image_height_idx/height;\n"
" const int height_idx=image_height_idx % height;\n"
" const int width_idx=image_width_idx % width;\n"
" const int channel_16_idx=(image_width_idx/width) << 4;\n"
" const int buffer_offset=((batch_idx*height+height_idx)*width+width_idx)*channels+channel_16_idx;\n"
" const int remain_channel=min(channels-channel_16_idx,16);\n"
" INPUT_TYPE16 values=0;\n"
" INPUT_TYPE* values_ptr=(INPUT_TYPE*)(&values);\n"
" __global const INPUT_TYPE *input_current_ptr=input_ptr+buffer_offset;\n"
" for(int i=0; i<remain_channel; ++i){\n"
" values_ptr[i]=*(input_current_ptr+i);\n"
" }\n"
" const int out_offset=(((batch_idx*((channels+15)/16)+channel_16_idx/16)*height+height_idx)*(output_pad_left+width+output_pad_right)+width_idx+output_pad_left)*16;\n"
" vstore16(CONVERT_OUTPUT16(values),0,output+out_offset);\n"
" if(width_idx == 0){\n"
" int pad_offset=(((batch_idx*((channels+15)/16)+channel_16_idx/16)*height+height_idx)*(output_pad_left+width+output_pad_right))*16;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" vstore16((OUTPUT_TYPE16)0,0,output+pad_offset+i*16);\n"
" }\n"
" pad_offset += (output_pad_right+width)*16;\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" vstore16((OUTPUT_TYPE16)0,0,output+pad_offset+i*16);\n"
" }\n"
" }\n"
"}\n"
"// convert data from buffer(nchw) to buffer(nc16hw16)\n"
"__kernel void nchw_buffer_to_nc16hw16_buffer(GLOBAL_SIZE_2_DIMS\n"
" __global const INPUT_TYPE *input_ptr,\n"
" __private const int height,__private const int width,__private const int channels,\n"
" __global OUTPUT_TYPE *output,\n"
" __private const int input_pad_left,__private const int input_pad_right,\n"
" __private const int output_pad_left,__private const int output_pad_right) {\n"
" int image_width_idx=get_global_id(0);\n"
" int image_height_idx=get_global_id(1);\n"
" \n"
" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n"
" const int src_width=width+input_pad_left+input_pad_right;\n"
" const int dst_width=width+output_pad_left+output_pad_right;\n"
" const int batch_idx=image_height_idx/height;\n"
" const int height_idx=image_height_idx % height;\n"
" const int width_idx=image_width_idx % width;\n"
" const int channel_16_idx=image_width_idx/width << 4;\n"
" const int buffer_offset=((batch_idx*channels+channel_16_idx)*height+height_idx)*src_width+width_idx+input_pad_left;\n"
" const int remain_channel=min(channels-channel_16_idx,16);\n"
" const int height_width_size=height*width;\n"
" INPUT_TYPE16 output_values=0;\n"
" INPUT_TYPE *output_values_ptr=(INPUT_TYPE*)(&output_values);\n"
" for(int i=0; i<remain_channel; ++i){\n"
" output_values_ptr[i]=*(input_ptr+buffer_offset+height_width_size*i);\n"
" }\n"
" if(width_idx == 0){\n"
" int pad_offset=(((batch_idx*((channels+15)/16)+channel_16_idx/16)*height+height_idx)*dst_width+0)*16;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" vstore16((OUTPUT_TYPE16)0,0,output+pad_offset+16*i);\n"
" }\n"
" pad_offset += 16*(width+output_pad_left);\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" vstore16((OUTPUT_TYPE16)0,0,output+pad_offset+16*i);\n"
" }\n"
" }\n"
" const int out_offset=(((batch_idx*((channels+15)/16)+channel_16_idx/16)*height+height_idx)*dst_width+width_idx+output_pad_left)*16;\n"
" vstore16(CONVERT_OUTPUT16(output_values),0,output+out_offset);\n"
"}\n"
"// convert data from image(b h,ic/16 w ic16) to buffer(nhwc)\n"
"__kernel void nc16hw16_buffer_to_nhwc_buffer(GLOBAL_SIZE_2_DIMS\n"
" __global OUTPUT_TYPE *output,\n"
" __private const int height,__private const int width,\n"
" __private const int channels,\n"
" __global INPUT_TYPE *input_ptr,\n"
" __private const int input_pad_left,__private const int input_pad_right,\n"
" __private const int output_pad_left,__private const int output_pad_right) {\n"
" int image_width_idx=get_global_id(0);\n"
" int image_height_idx=get_global_id(1);\n"
" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n"
" const int batch_idx=image_height_idx/height;\n"
" const int height_idx=image_height_idx % height;\n"
" const int width_idx=image_width_idx % width;\n"
" const int channel_16_idx=(image_width_idx/width) << 4;\n"
" const int buffer_offset=((batch_idx*height+height_idx)*width+width_idx)*channels+channel_16_idx;\n"
" const int in_offset=(((batch_idx*((channels+15)/16)+channel_16_idx/16)*height+height_idx)*(input_pad_left+width+input_pad_right)+width_idx+input_pad_left)*16;\n"
" INPUT_TYPE16 values=vload16(0,input_ptr+in_offset);\n"
" INPUT_TYPE* values_ptr=(INPUT_TYPE*)(&values);\n"
" const int remain_channel=min(channels-channel_16_idx,16);\n"
" for(int i=0; i<remain_channel; ++i){\n"
" output[buffer_offset+i]=(OUTPUT_TYPE)values_ptr[i];\n"
" }\n"
"}\n"
"// convert data from buffer(nc16hw16) to buffer(nchw)\n"
"__kernel void nc16hw16_buffer_to_nchw_buffer(GLOBAL_SIZE_2_DIMS\n"
" __global OUTPUT_TYPE *output,\n"
" __private const int height,__private const int width,\n"
" __private const int channels,\n"
" __global INPUT_TYPE *input_ptr,\n"
" __private const int input_pad_left,__private const int input_pad_right,\n"
" __private const int output_pad_left,__private const int output_pad_right) {\n"
" int image_width_idx=get_global_id(0);\n"
" int image_height_idx=get_global_id(1);\n"
" \n"
" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n"
" \n"
" const int src_width=width+input_pad_left+input_pad_right;\n"
" const int batch_idx=image_height_idx/height;\n"
" const int height_idx=image_height_idx % height;\n"
" const int width_idx=image_width_idx % width;\n"
" int channel_16_idx=(image_width_idx/width) << 4;\n"
" int buffer_offset=((batch_idx*channels+channel_16_idx)*height+height_idx)*width+width_idx;\n"
" \n"
" const int in_offset=(((batch_idx*((channels+15)/16)+channel_16_idx/16)*height+height_idx)*src_width+width_idx+input_pad_left)*16;\n"
" INPUT_TYPE16 values=vload16(0,input_ptr+in_offset);\n"
" INPUT_TYPE *values_ptr=(INPUT_TYPE*)(&values);\n"
" const int height_width_size=height*width;\n"
" const int remain_channel=min(channels-channel_16_idx,16);\n"
" for(int i=0; i<remain_channel; ++i){\n"
" output[buffer_offset+i*height_width_size]=(OUTPUT_TYPE)values_ptr[i];\n"
" }\n"
"}\n"
"__kernel void nc4hw4_buffer_to_nc16hw16_buffer(GLOBAL_SIZE_2_DIMS\n"
" __global const INPUT_TYPE *input_ptr,\n"
" __private const int2 output_shape,\n"
" __private const int2 src_stride,\n"
" __private const int2 dst_stride,\n"
" __global OUTPUT_TYPE *output,\n"
" __private const int input_pad_left,\n"
" __private const int input_pad_right,\n"
" __private const int output_pad_left,\n"
" __private const int output_pad_right,\n"
" __private const int channelc4\n"
") {\n"
" int image_width_idx=get_global_id(0);\n"
" int image_height_idx=get_global_id(1);\n"
" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n"
" const int batch_idx=image_height_idx/output_shape.x;\n"
" const int height_idx=image_height_idx % output_shape.x;\n"
" const int width_idx=image_width_idx % output_shape.y;\n"
" const int channel_block_idx=image_width_idx/output_shape.y;\n"
" const int in_channel_block_idx=channel_block_idx << 2;\n"
" const int dst_width=output_pad_left+output_shape.y+output_pad_right;\n"
" int2 src_bc_offset=src_stride*(int2)(batch_idx,in_channel_block_idx);\n"
" int2 dst_bc_offset=dst_stride*(int2)(batch_idx,channel_block_idx);\n"
" int src_buffer_offset =\n"
" (((src_bc_offset.x+src_bc_offset.y)*output_shape.x+height_idx)*output_shape.y+width_idx)*4;\n"
" int dst_buffer_offset =\n"
" (((dst_bc_offset.x+dst_bc_offset.y)*output_shape.x+height_idx)*dst_width+width_idx+output_pad_left)*16;\n"
" int width_height_size4=output_shape.x*output_shape.y*4;\n"
" INPUT_TYPE4 values0=vload4(0,input_ptr+src_buffer_offset);\n"
" INPUT_TYPE4 values1=in_channel_block_idx+1 >= src_bc_offset.x ? (INPUT_TYPE4)0 : vload4(0,input_ptr+src_buffer_offset+width_height_size4);\n"
" INPUT_TYPE4 values2=in_channel_block_idx+2 >= src_bc_offset.x ? (INPUT_TYPE4)0 : vload4(0,input_ptr+src_buffer_offset+width_height_size4*2);\n"
" INPUT_TYPE4 values3=in_channel_block_idx+3 >= src_bc_offset.x ? (INPUT_TYPE4)0 : vload4(0,input_ptr+src_buffer_offset+width_height_size4*3);\n"
" \n"
" vstore16(CONVERT_OUTPUT16((INPUT_TYPE16)(values0.s0123,values1.s0123,values2.s0123,values3.s0123)),0,output+dst_buffer_offset);\n"
" if(width_idx == 0){\n"
" int pad_offset=(((dst_bc_offset.x+dst_bc_offset.y)*output_shape.x+height_idx)*dst_width)*16;\n"
" for(int i=0; i<output_pad_left; ++i){\n"
" vstore16((OUTPUT_TYPE16)0,0,output+pad_offset+16*i);\n"
" }\n"
" pad_offset += 16*(output_shape.y+output_pad_left);\n"
" for(int i=0; i<output_pad_right; ++i){\n"
" vstore16((OUTPUT_TYPE16)0,0,output+pad_offset+16*i);\n"
" }\n"
" }\n"
"}\n"
"__kernel void nc16hw16_buffer_to_nc4hw4_buffer(GLOBAL_SIZE_2_DIMS\n"
" __global const INPUT_TYPE *input_ptr,\n"
" __private const int2 output_shape,\n"
" __private const int2 src_stride,\n"
" __private const int2 dst_stride,\n"
" __global OUTPUT_TYPE *output,\n"
" __private const int input_pad_left,__private const int input_pad_right,\n"
" __private const int output_pad_left,__private const int output_pad_right,\n"
" __private const int channelc4\n"
") {\n"
" int image_width_idx=get_global_id(0);\n"
" int image_height_idx=get_global_id(1);\n"
" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n"
" const int batch_idx=image_height_idx/output_shape.x;\n"
" const int height_idx=image_height_idx % output_shape.x;\n"
" const int width_idx=image_width_idx % output_shape.y;\n"
" const int channel_block_idx=image_width_idx/output_shape.y;\n"
" const int out_channel_block_idx=channel_block_idx << 2;\n"
" int2 src_bc_offset=src_stride*(int2)(batch_idx,channel_block_idx);\n"
" int2 dst_bc_offset=dst_stride*(int2)(batch_idx,out_channel_block_idx);\n"
" int width_height_size4=output_shape.x*output_shape.y*4;\n"
" int src_buffer_offset =\n"
" (((src_bc_offset.x+src_bc_offset.y)*output_shape.x+height_idx)*(input_pad_left+output_shape.y+input_pad_right)+width_idx+input_pad_left)*16;\n"
" int dst_buffer_offset =\n"
" (((dst_bc_offset.x+dst_bc_offset.y)*output_shape.x+height_idx)*output_shape.y+width_idx)*4;\n"
" INPUT_TYPE16 values=vload16(0,input_ptr+src_buffer_offset);\n"
" \n"
" vstore4(CONVERT_OUTPUT4(values.s0123),0,output+dst_buffer_offset);\n"
" if(out_channel_block_idx+1 >= channelc4) return;\n"
" vstore4(CONVERT_OUTPUT4(values.s4567),0,output+dst_buffer_offset+width_height_size4);\n"
" if(out_channel_block_idx+2 >= channelc4) return;\n"
" vstore4(CONVERT_OUTPUT4(values.s89ab),0,output+dst_buffer_offset+2*width_height_size4);\n"
" if(out_channel_block_idx+3 >= channelc4) return;\n"
" vstore4(CONVERT_OUTPUT4(values.scdef),0,output+dst_buffer_offset+3*width_height_size4);\n"
"}\n"
;
#endif
#endif
}
